# Importing a geographic health disparities data set from github repo
urlfile = 'https://raw.githubusercontent.com/eitanaka/DATS6101_Project1_Team2/main/dataset/geographic_health_disparities.csv'
geo_health_df <- read_csv(url(urlfile))
(need edited)
Familiarize yourself with the data by looking at the data type, variable names, and number of observations. Understand what each variable represents and what type of analysis you may need to perform.
# Look at the datatypes
str(geo_health_df)
# A number of observation
nrow(geo_health_df)
# Understand what each variable represents and what type of analysis you may need to perform.
#
Missing data can cause issues in the analysis. Check for missing values and decide how to handle them (either by imputing or dropping them).
# check for the missing value
sum(is.na(geo_health_df))
Outliers can skew the analysis and affect the model’s performance. Identify and understand the presence of any outliers and determine whether to remove them or not.
Understand the distribution of each variable in the dataset. Identify whether the data is normally distributed or skewed, and determine whether transformations are necessary.
# check for outliers
outlier_Depression <- outlierKD2(geo_health_df, geo_health_df$Data_Value.DEPRESSION)
outlier_MLHTH <- outlierKD2(geo_health_df, geo_health_df$Data_Value.MHLTH)
outlier_SLEEP <- outlierKD2(geo_health_df, geo_health_df$Data_Value.SLEEP)
outlier_LPA <- outlierKD2(geo_health_df, geo_health_df$Data_Value.LPA)
To understand the central tendency, variability, and spread of the data.
# Create a list containing the data frames by each state
data_by_state <- split(geo_health_df, geo_health_df$StateAbbr)
# Summary statistic about percentage of population affected with depression in each state
summary_by_state_DEPRESSION <- lapply(data_by_state, function(x) summary(x$Data_Value.DEPRESSION))
head(summary_by_state_DEPRESSION,3)
tail(summary_by_state_DEPRESSION,3)
# A list of standard deviation for percentage of population affected with Depression in each state
sd_by_state_DEPRESSION<- lapply(data_by_state, function(x) sd(x$Data_Value.DEPRESSION))
head(sd_by_state_DEPRESSION,3)
tail(sd_by_state_DEPRESSION,3)
# A data frame containing mean values of the incidence rate of Depression for each country in each state
mean_by_state_DEPRESSION <- lapply(data_by_state, function(x) mean(x$Data_Value.DEPRESSION))
mean_df_DEPRESSION <-data.frame(State = names(mean_by_state_DEPRESSION), Mean_Depression=unlist(mean_by_state_DEPRESSION))
mean_df_DEPRESSION["fips"] <- fips(mean_df_DEPRESSION$State)
# A map of the U.S. showing average percent of people suffering from DEPRESSION for each country in each state
plot_usmap(data=mean_df_DEPRESSION, values="Mean_Depression", labels = TRUE) +
scale_fill_continuous(low = "white", high = "red", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Mean Depression by State") +
guides(fill = guide_colorbar(title = "Mean %",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
# Summary of the percentage of the population in each country in each state with a health status of 14 or more days with poor mental health
summary_by_state_MHLTH <- lapply(data_by_state, function(x) summary(x$Data_Value.MHLTH))
head(summary_by_state_MHLTH,3)
## $AK
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.1 12.1 12.9 13.1 13.8 21.2
##
## $AL
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.9 15.6 16.9 16.9 18.1 30.0
##
## $AR
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.8 15.8 16.8 16.8 17.9 24.2
tail(summary_by_state_MHLTH,3)
## $WI
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.8 12.6 13.4 13.8 14.6 26.0
##
## $WV
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.0 17.3 18.4 18.6 19.6 29.5
##
## $WY
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 12.3 13.2 13.3 14.1 21.6
# The list containing standard deviation per state: The percentage of the population in each country with a health status of 14 or more day with poor mental health
sd_by_state_MHLTH <- lapply(data_by_state, function(x) sd(x$Data_Value.MHLTH))
head(sd_by_state_MHLTH,3)
## $AK
## [1] 1.96
##
## $AL
## [1] 2.26
##
## $AR
## [1] 1.93
tail(sd_by_state_MHLTH,3)
## $WI
## [1] 2.2
##
## $WV
## [1] 2.02
##
## $WY
## [1] 1.61
# The data frame containing mean value per state: Percentage of the population in each country with poor mental health for 14 or more days as a health status.
mean_by_state_MHLTH <- lapply(data_by_state, function(x) mean(x$Data_Value.MHLTH))
mean_df_MHLTH <- data.frame(State = names(mean_by_state_MHLTH), Mean_MHLTH = unlist(mean_by_state_MHLTH))
mean_df_MHLTH["fips"] <- fips(mean_df_MHLTH$State)
# Map of the United States plotting the mean percent incidence of the population with poor mental health by state.
plot_usmap(data=mean_df_MHLTH, values="Mean_MHLTH", labels = TRUE) +
scale_fill_continuous(low = "white", high = "green", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Mean Mental Health by State") +
guides(fill = guide_colorbar(title = " Mean %",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
# A list of summary statistic per state: Each data is a percentage of sleep disturbance in each country
summary_by_state_SLEEP <- lapply(data_by_state, function(x) summary(x$Data_Value.SLEEP))
head(summary_by_state_SLEEP,3)
## $AK
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.8 30.6 32.2 32.2 33.5 39.3
##
## $AL
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.0 36.0 38.7 39.4 42.2 54.2
##
## $AR
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.2 32.1 34.0 34.8 36.4 47.7
tail(summary_by_state_SLEEP,3)
## $WI
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.1 29.0 30.6 31.0 32.2 46.2
##
## $WV
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.3 37.9 39.7 39.9 41.9 50.1
##
## $WY
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.5 29.8 30.8 31.2 33.0 37.4
# A list of standard diviation per state: Each data is a percentage of sleep disturbance in each country
sd_by_state_SLEEP <- lapply(data_by_state, function(x) sd(x$Data_Value.SLEEP))
head(sd_by_state_SLEEP,3)
## $AK
## [1] 2.51
##
## $AL
## [1] 4.92
##
## $AR
## [1] 4.02
tail(sd_by_state_SLEEP,3)
## $WI
## [1] 3.95
##
## $WV
## [1] 2.96
##
## $WY
## [1] 2.29
# A data frame about mean value per state: Each data is a percentage of sleep disturbance in each country
mean_by_state_SLEEP <- lapply(data_by_state, function(x) mean(x$Data_Value.SLEEP))
mean_df_SLEEP <- data.frame(State = names(mean_by_state_SLEEP), Mean_Sleep = unlist(mean_by_state_SLEEP))
mean_df_SLEEP["fips"] <- fips(mean_df_SLEEP$State)
# Map of the United States plotting the mean percent incidence of the population with poor sleep by state.
plot_usmap(data=mean_df_SLEEP, values="Mean_Sleep", labels = TRUE) +
scale_fill_continuous(low = "white", high = "purple", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Mean Sleep by State") +
guides(fill = guide_colorbar(title = "Mean %",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
# A list of summary statistic per state: Data are percent of population of people without leisure time in each country
summary_by_state_LPA <- lapply(data_by_state, function(x) summary(x$Data_Value.LPA))
head(summary_by_state_LPA,3)
tail(summary_by_state_LPA,3)
# A list of standard deviation per state: data are percent of population of people without leisure time in each country
sd_by_state_LPA <- lapply(data_by_state, function(x) sd(x$Data_Value.LPA))
head(sd_by_state_LPA,3)
tail(sd_by_state_LPA,3)
# A data frame about mean value per state: data are percent of population of people without leisure time in each country
mean_by_state_LPA <- lapply(data_by_state, function(x) mean(x$Data_Value.LPA))
mean_df_LPA <- data.frame(State = names(mean_by_state_LPA), Mean_LPA = unlist(mean_by_state_LPA))
mean_df_LPA["fips"] <- fips(mean_df_LPA$State)
# Map of the United States plotting the mean percent incidence of the population without leisure time by state.
plot_usmap(data=mean_df_LPA, values="Mean_LPA", labels = TRUE) +
scale_fill_continuous(low = "white", high = "cyan", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Mean Lack of Physial Activity by State") +
guides(fill = guide_colorbar(title = "Mean %",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
# create mean_df for all four conditions
mean_df <- merge(merge(mean_df_DEPRESSION, mean_df_MHLTH, by=c("State", "fips")), mean_df_SLEEP, by=c("State", "fips"))
mean_df <- merge(mean_df, mean_df_LPA, by=c("State", "fips"))
colnames(mean_df)[3:6] <- c("Depression", "MHLTH", "Sleep", "LPA")
head(mean_df)
# create a long format of the data
mean_df_long <- gather(mean_df, key = "Variable", value = "Value", -State, -fips)
# group data by Variable and get the max and min values for each group
max_min_df <- mean_df_long %>% group_by(Variable) %>%
slice(which.max(Value), which.min(Value)) %>% ungroup()
## create a box plot for each variable and facet by variable
ggplot(mean_df_long, aes(x = "", y = Value, fill = Variable)) +
geom_boxplot() +
geom_jitter(aes(color = Variable), width = 0.2, size = 2) +
facet_wrap(~Variable, ncol = 4, scales = "fixed") +
scale_fill_manual(values = c("pink","lightgreen", rgb(200, 162, 200, maxColorValue = 255), "cyan")) +
scale_color_manual(values = c("darkred","darkgreen", "darkorchid", "darkblue")) +
labs(title = "Box plots of population proportions for each Measure", x="Health conditions", y = "Population proportion") +
geom_text(data = max_min_df, aes(x = 1.25, y = Value, label = State), size = 4, fontface = "bold", hjust = -0.2, color = "black") # add state labels for max and min values
This code will create a table showing the mean, median, and standard deviation for each variable (depression, sleep, and mental health) across all states, grouped by state names or FIPS codes. You can compare the results to see how each variable varies across different states.
# Compute the correlation matrix
# create national correlation matrix
cor_matrix <- cor(mean_df[ , c("Depression", "MHLTH", "Sleep", "LPA")])
# create national correlation heat map
corrplot(cor_matrix, method = "color")
# create mixed national correlation heat map
corrplot.mixed(cor_matrix)
# Create a data frame containing state, FIPs code, cor_MHLTH_sleep, and cor_MHLTH_LPA
cor_by_state_matrix <- lapply(data_by_state, function(state) cor(state[c("Data_Value.DEPRESSION", "Data_Value.MHLTH", "Data_Value.SLEEP", "Data_Value.LPA")]))
cor_by_state_df <- data.frame(
state = names(cor_by_state_matrix),
MHLTH_SLEEP = sapply(cor_by_state_matrix, function(state) state[2,3]),
MHLTH_LPA = sapply(cor_by_state_matrix, function(state) state[2,4])
)
cor_by_state_df["fips"] <- fips(cor_by_state_df$state)
# The map of US about correlation between MHLTH and Sleep
plot_usmap(data=cor_by_state_df, values="MHLTH_SLEEP", labels = TRUE) +
scale_fill_continuous(low = "white", high = "blue", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Correlation Betweeen Mental Health and SLEEP") +
guides(fill = guide_colorbar(title = "Correlation",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
# The map of US about correlation between MHLTH and LPA
plot_usmap(data=cor_by_state_df, values="MHLTH_LPA", labels = TRUE) +
scale_fill_continuous(low = "white", high = "blue", guide = FALSE) +
scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0)) +
ggtitle("Correlation Betweeen Mental Health and Sleep") +
guides(fill = guide_colorbar(title = "Correlation",
title.position = "top",
title.hjust = 0.5,
label.position = "left",
label.hjust = 0.5))
#Linear regression
# data frame is named 'mean_df'
Linear_reg <- lm(Depression ~ Sleep + LPA, data = mean_df)
summary(Linear_reg)
Linear_reg <- lm(MHLTH ~ Sleep + LPA, data = mean_df)
summary(Linear_reg)
We failed to reject depression and there is a significant test for mental health . that means mental health is effected by sleep and LPA.
# create scatter plots
# sleep scatter plot
ggplot(mean_df, aes(x = Sleep)) +
geom_point(aes(y = Depression, color = "Depression")) +
geom_point(aes(y = MHLTH, color = "MHLTH")) +
scale_color_manual(name = "Health Outcomes", values = c("Depression" = "red", "MHLTH" = "green")) +
labs(x = "% Lacking Sleep",
y = "% With Health Outcomes",
title = "Correlation Between Lack of Sleep and Health Outcomes")
# lpa scatter plot
ggplot(mean_df, aes(x = LPA)) +
geom_point(aes(y = Depression, color = "Depression")) +
geom_point(aes(y = MHLTH, color = "MHLTH")) +
scale_color_manual(name = "Health Outcomes", values = c("Depression" = "red", "MHLTH" = "green")) +
labs(x = "% Lacking Physical Activity",
y = "% With Health Outcomes",
title = "Correlation Between Lack of Physical Activity and Health Outcomes")